
/* 

Paper: Gentrification and pioneer businesses 
Authors: Behrens, Boulam, Martin, Mayneris 

Name dofile: create_db_concordedblocks.do  
Version: 15 nov. 2021 

Output: id_conc_database.dta 

Inputs:

- naics02_NETSlabel.dta (publicly available)
- duns_1990_2010_idconc_naics.dta  (from NETS data)
- corres_block900010 (authors computation, publicly available)
- education.dta, housing.dta, income.dta, race.dta (from prepare_blockvar_*.dta datasets, publicly available)
- geography1990.dta, geography2000.dta, geography2010.dta (from Census files, publicly available)

*/ 



set more off

// put census variables together (non-concorded) block level 
use education, clear 
sort geoid year 
merge 1:1 geoid year using housing 
drop _m 
sort geoid year 
merge 1:1 geoid year using income 
drop _m 
merge 1:1 geoid year using race
drop pop_white_block_direct pop_black_block_direct pop_asian_block_direct
drop _m 


save census_variables.dta, replace 

// put geo variables together (non-concorded) block level 
local listvar geoid aland awater housing intptlat intptlon pop

use geography1990, clear 
keep geoid aland awater housing intptlat intptlon pop
foreach i in `listvar' {
rename `i'90 `i'
}
g year=1990 
saveold geo1990.dta, replace v(13) 

use geography2000, clear 
keep geoid aland awater housing intptlat intptlon pop
foreach i in `listvar' {
rename `i'00 `i'
}
g year=2000 
saveold geo2000.dta, replace v(13) 

use geography2010, clear 
keep geoid aland awater housing intptlat intptlon pop
foreach i in `listvar' {
rename `i'10 `i'
}
g year=2010 
saveold geo2010.dta, replace v(13) 
 

use geo1990, clear
append using geo2000 geo2010 

*While geoid should be 14 or 15 character long, 
*it is 12 or 13 character long for many blocks in 1990.
*We have a procedure to correct for that

gen var=substr(geoid,1,9) if year==1990 & length(geoid)<14
gen var2=substr(geoid, 10,4) if year==1990 & length(geoid)<14
replace geoid=var+"00"+var2 if year==1990 & length(geoid)<14
drop var var2

save geo_variables, replace 

// create a dataset with stable geographic characteristics at the concorded block level
use geo_variables, clear
keep geoid year aland awater intptlat intptlon
sort geoid year
merge 1:1 geoid year using corres_block900010.dta
tab year if _m==2
*31 895 _m==2: some blocks are in the correspondance database but not in the geodatabase that comes from the Census
*most of them are observations from 1990
*keep if _m==3  
drop _m 
label var id_conc concorded_block
keep if year>1990 // geographic information in 1990 is not perfectly reliable: we decide to compute concorded block geographic information as an average of the characeristics in 2000 and 2010

collapse (sum) aland awater (mean) intptlat intptlon, by(id_conc year)
collapse (mean) aland awater intptlat intptlon, by(id_conc)
sort id_conc

save geo_variables_conc_block, replace 

// create the census variables at the concorded block level and merge with stable geographic information
use geo_variables, clear
keep geoid year housing pop

sort geoid year
merge 1:1 geoid year using corres_block900010.dta
drop _m

sort geoid year
merge 1:1 geoid year using census_variables.dta
drop _m 

foreach v of var median_yearb_block median_crent_block median_grent_block median_value_block  {
bys id_conc year: egen housing_tot=sum(housing) if (`v'>0&`v'!=.)
gen sh=housing/housing_tot
replace `v'=sh*`v'
drop sh housing_tot
}

foreach v of var median_income_block percap_income_block {
bys id_conc year: egen pop_tot=sum(pop) if `v'!=.
gen sh=pop/pop_tot
replace `v'=sh*`v'
drop sh pop_tot
}

foreach v of var pop pop_* housing edu_college_block median_yearb_block median_crent_block median_grent_block median_value_block  median_income_block percap_income_block{
gen miss_`v'= `v'!=. // we identify conc_blocks which have only missing values for each variable, so as to replace the information as missing after the collapse
}

collapse (sum) pop pop_* housing edu_college_block median_* percap_income_block miss_*, by(id_conc year) 
 
foreach v of var pop pop_* edu_college_block median_yearb_block median_crent_block median_grent_block median_value_block  median_income_block percap_income_block{
replace `v'=. if miss_`v'==0
}

foreach v of var edu_college_block pop_* {
replace `v'=0 if pop==0 // when pop==0, the other variables regarding pop composition are also 0, and not .
}

drop miss_*

merge m:1 id_conc using  geo_variables_conc_block
drop _m

saveold id_conc_database.dta, replace 


erase census_variables.dta
erase geo_variables_conc_block.dta

// deal with NETS data 
use duns_1990_2010_idconc_naics.dta, clear
keep if year==1990|year==2000|year==2010
gen t=1 if year==1990
replace t=2 if year==2000
replace t=3 if year==2010
gen nbplants=1
collapse (sum) nbplants emp, by (id_conc t)
egen id=group(id_conc t)
save temp, replace 


// merge establishment data with demographic, econ and geographical variables
use id_conc_database, clear
rename year year_census
egen t=group(year)
merge 1:1 id_conc t using temp 
erase temp.dta
drop if _m==2

foreach v of var emp* nbplants* {
replace `v'=0 if _m==1
}

drop _m 

gen lat_ws=40.706154
gen lon_ws=-74.008794

gen dis_ws = acos(sin((intptlat*_pi)/180) * sin((lat_ws*_pi)/180) + cos(abs((intptlon*_pi)/180- (lon_ws*_pi)/180)) * cos(intptlat*_pi/180) * cos(lat_ws*_pi/180)) * 6378.137

drop lat_ws lon_ws
save id_conc_database.dta, replace 




